Importing modules and data¶

In [1]:
# importing relevant modules
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

pd.options.mode.chained_assignment = None
In [2]:
# importing data as csv, assigning to DataFrame
# data source: https://figshare.com/articles/dataset/TetraDENSITY_Population_Density_dataset/5371633?file=20334360
animals = pd.read_csv('TetraDENSITY_v.1.csv')

Inspecting data¶

In [3]:
# rows and columns
animals.shape
Out[3]:
(18246, 19)
In [4]:
# info on each column
animals.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18246 entries, 0 to 18245
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Class               18246 non-null  object 
 1   Order               18246 non-null  object 
 2   Family              18246 non-null  object 
 3   Genus               18246 non-null  object 
 4   Species             18246 non-null  object 
 5   Subspecies          827 non-null    object 
 6   Longitude           18245 non-null  float64
 7   Latitude            18245 non-null  float64
 8   Locality            15092 non-null  object 
 9   Country             18246 non-null  object 
 10  Year                17127 non-null  object 
 11  Season/Month        9911 non-null   object 
 12  Habitat             8856 non-null   object 
 13  Sampling_Area       11085 non-null  float64
 14  Sampling_Area_unit  11085 non-null  object 
 15  Density             18246 non-null  float64
 16  Density_unit        18246 non-null  object 
 17  Sampling_Method     15454 non-null  object 
 18  Method_info         9619 non-null   object 
dtypes: float64(4), object(15)
memory usage: 2.6+ MB
In [5]:
# first 5 rows
animals.head()
Out[5]:
Class Order Family Genus Species Subspecies Longitude Latitude Locality Country Year Season/Month Habitat Sampling_Area Sampling_Area_unit Density Density_unit Sampling_Method Method_info
0 Amphibia Anura Brachycephalidae Brachycephalus didactylus NaN -44.200000 -23.183 Vila Dois Rios (Ilha Grande) Brazil 1997 January – May Atlantic rainforest 0.0064 ha 52.0 ind/ha Incomplete_counts 24 plots 8x8m
1 Amphibia Anura Brachycephalidae Brachycephalus didactylus NaN -44.200000 -23.183 Vila Dois Rios (Ilha Grande) Brazil 1997 January – May Atlantic rainforest 0.0002 ha 1778.0 ind/ha Incomplete_counts 90 plots 2x1m and litter removal method
2 Amphibia Anura Brachycephalidae Brachycephalus didactylus NaN -42.583000 -22.417 Fazenda Santa Bárbara in the Parque Estadual d... Brazil 2006 Late October - Early November Atlantic rainforest 0.0025 ha 400.0 ind/ha Incomplete_counts 25 plots 5x5m
3 Amphibia Anura Brachycephalidae Brachycephalus hermogenesi NaN -48.266667 -25.150 Reserva Particular do Patrimônio Natural Salto... Brazil 2009-2010 Summer Rainforest 0.2560 ha 16.0 ind/ha Incomplete_counts Counts in plots
4 Amphibia Anura Brachycephalidae Brachycephalus hermogenesi NaN -48.266667 -25.150 Reserva Particular do Patrimônio Natural Salto... Brazil 2009-2010 Autumn Rainforest 0.2560 ha 16.0 ind/ha Incomplete_counts Counts in plots

Cleaning data¶

In [6]:
# dropping unnecessary columns
animals = animals.drop(['Sampling_Method', 'Method_info'], axis = 1)
In [7]:
# checking area units
animals['Sampling_Area_unit'].unique()
Out[7]:
array(['ha', nan, 'km2'], dtype=object)
In [8]:
# defining function to convert area from hectares to km2
def ha_to_km2(ha):
    return ha * 0.01
In [9]:
# converting areas from hectares to km2, and changing units from 'ha' to 'km2'
for ind in animals.index:
    if animals['Sampling_Area_unit'][ind] == 'ha':
        animals['Sampling_Area'][ind] = ha_to_km2(animals['Sampling_Area'][ind])
        animals['Sampling_Area_unit'][ind] = 'km2'
In [10]:
animals['Sampling_Area_unit'].unique()
Out[10]:
array(['km2', nan], dtype=object)
In [11]:
# checking density units
animals['Density_unit'].unique()
Out[11]:
array(['ind/ha', 'males/ha', 'pairs/km2', 'ind/km2'], dtype=object)
In [12]:
# converting density from ind/ha to ind/km2 
for ind in animals.index:
    if animals['Density_unit'][ind] == 'ind/ha':
        animals['Density'][ind] = ha_to_km2(animals['Density'][ind])
        animals['Density_unit'][ind] = 'ind/km2'
In [13]:
# converting density from males/ha to males/km2
for ind in animals.index:
    if animals['Density_unit'][ind] == 'males/ha':
        animals['Density'][ind] = ha_to_km2(animals['Density'][ind])
        animals['Density_unit'][ind] = 'males/km2'
In [14]:
animals['Density_unit'].unique()
Out[14]:
array(['ind/km2', 'males/km2', 'pairs/km2'], dtype=object)
In [15]:
# converting all to ind/km2 - assuming equal numbers of male and female animals
for ind in animals.index:
    if animals['Density_unit'][ind] == 'males/km2':
        animals['Density'][ind] = (animals['Density'][ind]) * 2
        animals['Density_unit'][ind] = 'ind/km2'

for ind in animals.index:
    if animals['Density_unit'][ind] == 'pairs/km2':
        animals['Density'][ind] = (animals['Density'][ind]) * 2
        animals['Density_unit'][ind] = 'ind/km2'
In [16]:
animals['Density_unit'].unique()
Out[16]:
array(['ind/km2'], dtype=object)

Exploring data¶

In [17]:
# viewing random sample of 10 rows
animals.sample(10)
Out[17]:
Class Order Family Genus Species Subspecies Longitude Latitude Locality Country Year Season/Month Habitat Sampling_Area Sampling_Area_unit Density Density_unit
5958 Aves Passeriformes Pomatostomidae Pomatostomus temporalis NaN 132.100000 -14.13000 Munmarlary Australia 1987 May Tropical forest (Unburnt) 0.1000 km2 18.000000 ind/km2
12474 Mammalia Cetartiodactyla Cervidae Rusa unicolor NaN 101.370000 14.44000 NaN Thailand NaN NaN NaN NaN NaN 13.000000 ind/km2
6780 Aves Passeriformes Sylviidae Sylvia atricapilla NaN 23.700000 52.70000 Białowieża National Park Poland 2003 year-round oak-hornbeam-lime forest 0.2400 km2 112.500000 ind/km2
11208 Mammalia Cetartiodactyla Bovidae Raphicerus campestris NaN 20.600000 -18.80000 Kaudom GP Namibia 1988 NaN NaN 3841.0000 km2 0.007810 ind/km2
17999 Reptilia Squamata Scincidae Emoia atrocostata NaN 123.200000 9.40000 Polo (Negros Island) Philippines 1964 July-November 1964 Mangrove forest 0.0359 km2 1.186630 ind/km2
17836 Reptilia Squamata Lacertidae Zootoca vivipara NaN 6.138000 51.54700 de Hamert reserve Netherlands 1981 NaN NaN 0.0120 km2 0.940000 ind/km2
9234 Mammalia Carnivora Canidae Cuon alpinus NaN 80.560190 22.29183 NaN India 1996 NaN NaN 940.0000 km2 0.323404 ind/km2
758 Aves Anseriformes Anatidae Cairina moschata NaN -62.466000 -4.33300 Terra Firme Brazil 2002-2003 NaN Upland Forest 4.5000 km2 1.150000 ind/km2
14887 Mammalia Primates Hominidae Gorilla gorilla NaN 14.583333 1.10000 Mbomo Congo 1989-1990 NaN NaN 1.9100 km2 0.600000 ind/km2
4573 Aves Passeriformes Motacillidae Anthus trivialis NaN 23.700000 52.70000 Białowieża National Park Poland 2000 year-round ash-alder forest 0.3300 km2 6.061000 ind/km2
In [18]:
# setting figure size to default
plt.rcParams["figure.figsize"] = plt.rcParamsDefault["figure.figsize"]
In [19]:
count_plot = sns.countplot(x = 'Class', data = animals)
In [20]:
plt.rcParams["figure.figsize"] = (10, 2)
country_count = animals['Country'].value_counts(sort = True).head(10)
country_count.plot(kind = 'bar', title = 'Top 10 countries by row count')
Out[20]:
<AxesSubplot: title={'center': 'Top 10 countries by row count'}>
In [21]:
plt.rcParams["figure.figsize"] = (15, 5)
year_count = animals['Year'].value_counts(sort = True).head(80).sort_index()
year_count.plot(kind = 'bar', title = 'Row counts by year')
Out[21]:
<AxesSubplot: title={'center': 'Row counts by year'}>
In [22]:
# top 10 families by row count
animals['Family'].value_counts(sort = True).head(10)
Out[22]:
Bovidae            2124
Cercopithecidae     726
Sylviidae           564
Fringillidae        537
Paridae             523
Cervidae            478
Elephantidae        431
Muscicapidae        422
Cricetidae          418
Felidae             400
Name: Family, dtype: int64

Analysing Białowieża National Park birds¶

In [23]:
animals[animals['Locality'] == 'Białowieża National Park'].groupby('Genus')['Genus'].value_counts().sort_values(ascending = False).head(10)
Out[23]:
Genus         Genus       
Parus         Parus           159
Dendrocopos   Dendrocopos     108
Ficedula      Ficedula         86
Turdus        Turdus           75
Phylloscopus  Phylloscopus     70
Regulus       Regulus          50
Columba       Columba          44
Sylvia        Sylvia           36
Prunella      Prunella         35
Fringilla     Fringilla        35
Name: Genus, dtype: int64
In [24]:
# creating new dataset containing Białowieża National Park 'paruses'
parus = animals[(animals['Locality'] == 'Białowieża National Park')&(animals['Genus'] == 'Parus')].drop(animals.iloc[:, 0:4], axis = 1)
parus.sample(5)
Out[24]:
Species Subspecies Longitude Latitude Locality Country Year Season/Month Habitat Sampling_Area Sampling_Area_unit Density Density_unit
5437 cristatus NaN 23.7 52.7 Białowieża National Park Poland 2004 year-round pine-bilberry 0.250 km2 32.000 ind/km2
5532 major NaN 23.7 52.7 Białowieża National Park Poland 2001 year-round oak-hornbeam-lime forest 0.300 km2 126.667 ind/km2
5555 major NaN 23.7 52.7 Białowieża National Park Poland 2004 year-round ash-alder forest 0.330 km2 81.818 ind/km2
5354 caeruleus NaN 23.7 52.7 Białowieża National Park Poland 2003 year-round oak-hornbeam-lime forest 0.255 km2 68.627 ind/km2
5248 ater NaN 23.7 52.7 Białowieża National Park Poland 2004 year-round oak-hornbeam-lime forest 0.300 km2 16.667 ind/km2
In [25]:
# checking parus species
parus['Species'].unique()
Out[25]:
array(['ater', 'caeruleus', 'cristatus', 'major', 'montanus', 'palustris'],
      dtype=object)
In [26]:
# species densities in oak-hornbeam lime forest

ater_oak = parus[(parus['Species'] == 'ater')&(parus['Habitat'] == 'oak-hornbeam-lime forest')].groupby(['Year'])['Density'].mean()
ater_oak.plot(kind = 'line', x = 'Year', y = 'Density', title = 'Parus species population densities in oak-hornbeam lime forest', legend = True, label = 'ater')

caeruleus_oak = parus[(parus['Species'] == 'caeruleus')&(parus['Habitat'] == 'oak-hornbeam-lime forest')].groupby(['Year'])['Density'].mean()
caeruleus_oak.plot(kind = 'line', x = 'Year', y = 'Density', legend = True, label = 'caeruleus')

cristatus_oak = parus[(parus['Species'] == 'cristatus')&(parus['Habitat'] == 'oak-hornbeam-lime forest')].groupby(['Year'])['Density'].mean()
cristatus_oak.plot(kind = 'line', x = 'Year', y = 'Density', legend = True, label = 'cristatus')

major_oak = parus[(parus['Species'] == 'major')&(parus['Habitat'] == 'oak-hornbeam-lime forest')].groupby(['Year'])['Density'].mean()
major_oak.plot(kind = 'line', x = 'Year', y = 'Density', legend = True, label = 'major')

montanus_oak = parus[(parus['Species'] == 'montanus')&(parus['Habitat'] == 'oak-hornbeam-lime forest')].groupby(['Year'])['Density'].mean()
montanus_oak.plot(kind = 'line', x = 'Year', y = 'Density', legend = True, label = 'montanus')

palustris_oak = parus[(parus['Species'] == 'palustris')&(parus['Habitat'] == 'oak-hornbeam-lime forest')].groupby(['Year'])['Density'].mean()
palustris_oak.plot(kind = 'line', x = 'Year', y = 'Density', legend = True, label = 'palustris')
Out[26]:
<AxesSubplot: title={'center': 'Parus species population densities in oak-hornbeam lime forest'}, xlabel='Year'>
In [27]:
# using a for loop to do the same thing - much less code!

species = parus['Species'].unique()

for specie in species:
    specie_data = parus[(parus['Species'] == specie)&(parus['Habitat'] == 'oak-hornbeam-lime forest')].groupby(['Year'])['Density'].mean()
    specie_data.plot(kind = 'line', x = 'Year', y = 'Density', title = 'Parus species population densities in oak-hornbeam lime forest', legend = True, label = specie)

Analysing cats¶

In [28]:
# creating new dataset containing only cats
cats = animals[animals['Family'] == 'Felidae']
In [29]:
sns.countplot(x = 'Genus', data = cats)
Out[29]:
<AxesSubplot: xlabel='Genus', ylabel='count'>
In [30]:
plt.rcParams["figure.figsize"] = (10, 3)
# creating new dataset containing only big cats
big_cats = cats[cats['Genus'] == 'Panthera']
big_cats_count_plot = sns.countplot(x = 'Species', data = big_cats)

Analysing big cats¶

In [31]:
# viewing random sample of 10 rows
big_cats.sample(10)
Out[31]:
Class Order Family Genus Species Subspecies Longitude Latitude Locality Country Year Season/Month Habitat Sampling_Area Sampling_Area_unit Density Density_unit
9464 Mammalia Carnivora Felidae Panthera pardus NaN 35.57418 -3.17037 NaN Tanzania 1988 NaN NaN 260.0 km2 0.076923 ind/km2
9439 Mammalia Carnivora Felidae Panthera onca NaN -53.70000 -26.60000 Green Corridor I Brazil 2003-2014 NaN NaN NaN NaN 0.009100 ind/km2
9659 Mammalia Carnivora Felidae Panthera tigris NaN 99.17000 15.42000 Huai Kha Khaeng Wildlife Sanctuary Thailand 2005 NaN NaN 2780.0 km2 0.018201 ind/km2
9389 Mammalia Carnivora Felidae Panthera leo NaN 35.57418 -3.17037 NaN Tanzania 1988 NaN NaN 260.0 km2 0.348740 ind/km2
9414 Mammalia Carnivora Felidae Panthera leo NaN 35.00000 -1.00000 Ol Kinyei Kenya 2014 August-October NaN NaN NaN 0.225000 ind/km2
9549 Mammalia Carnivora Felidae Panthera pardus NaN 80.56019 22.29183 NaN India 1998 NaN NaN 110.0 km2 0.090909 ind/km2
9545 Mammalia Carnivora Felidae Panthera pardus NaN 80.41137 23.61895 NaN India 1998 NaN NaN 449.0 km2 0.060134 ind/km2
9601 Mammalia Carnivora Felidae Panthera tigris NaN 79.47063 21.61774 NaN India 1993 NaN NaN 758.0 km2 0.036939 ind/km2
9614 Mammalia Carnivora Felidae Panthera tigris NaN 78.93506 29.53330 NaN India 1995 NaN NaN 1319.0 km2 0.101592 ind/km2
9477 Mammalia Carnivora Felidae Panthera pardus NaN 76.43633 27.31565 NaN India 1991 NaN NaN 866.0 km2 0.032333 ind/km2
In [32]:
# removing unnecessary columns
big_cats = big_cats.drop(big_cats.iloc[:, 0:4], axis = 1)
big_cats.head(5)
Out[32]:
Species Subspecies Longitude Latitude Locality Country Year Season/Month Habitat Sampling_Area Sampling_Area_unit Density Density_unit
9368 leo NaN 15.78200 -19.01763 NaN Namibia 1926 NaN NaN 74000.0 km2 0.002838 ind/km2
9369 leo NaN 36.11150 -4.15202 NaN Tanzania 1962 NaN NaN 1683.0 km2 0.008913 ind/km2
9370 leo NaN 36.11150 -4.15202 NaN Tanzania 1962 NaN NaN 1683.0 km2 0.035651 ind/km2
9371 leo NaN 31.58213 -23.98913 NaN South Africa 1964 NaN NaN 18989.0 km2 0.058982 ind/km2
9372 leo NaN 35.57418 -3.17037 NaN Tanzania 1965 NaN NaN 260.0 km2 0.115385 ind/km2
In [33]:
# counts by year
plt.rcParams["figure.figsize"] = (15, 5)
year_count = big_cats['Year'].value_counts(sort = True).head(80).sort_index()
year_count.plot(kind = 'bar', title = 'Row counts by year')
Out[33]:
<AxesSubplot: title={'center': 'Row counts by year'}>
In [34]:
# visualising using plotly
import plotly.express as px

fig = px.scatter_geo(big_cats, lat = 'Latitude', lon = 'Longitude', hover_name = "Year", color = 'Species')
fig.update_layout(title = 'Big Cat Population Locations', title_x = 0.5)
fig.show()
SpeciesleooncapardustigrisBig Cat Population Locations
plotly-logomark
In [35]:
# Thailand tiger population density over time
fig, ax = plt.subplots(figsize = (7, 3))
thai_tigers = big_cats[(big_cats['Species'] == 'tigris')&(big_cats['Locality'] == 'Huai Kha Khaeng Wildlife Sanctuary')]
thai_tigers.plot(ax = ax, kind = 'line', x = 'Year', y = 'Density', title = 'Tiger Population Density - Huai Kha Khaeng Wildlife Sanctuary', legend = [])
Out[35]:
<AxesSubplot: title={'center': 'Tiger Population Density - Huai Kha Khaeng Wildlife Sanctuary'}, xlabel='Year'>
In [36]:
# focussing on jaguars, 2003-14
jaguars_0314 = big_cats[(big_cats['Species'] == 'onca') & (big_cats['Year'] == '2003-2014')]
jaguars_0314
Out[36]:
Species Subspecies Longitude Latitude Locality Country Year Season/Month Habitat Sampling_Area Sampling_Area_unit Density Density_unit
9431 onca NaN -53.8 -22.30 Ivinhema Brazil 2003-2014 NaN NaN NaN NaN 0.0166 ind/km2
9432 onca NaN -40.3 -19.20 Vale NR I Brazil 2003-2014 NaN NaN NaN NaN 0.0242 ind/km2
9433 onca NaN -53.7 -26.60 Green Corridor II Brazil 2003-2014 NaN NaN NaN NaN 0.0107 ind/km2
9434 onca NaN -54.5 -25.60 Iguazú-San Jorge Brazil 2003-2014 NaN NaN NaN NaN 0.0120 ind/km2
9435 onca NaN -54.2 -25.85 Iguazú-Urugua-í Argentina 2003-2014 NaN NaN NaN NaN 0.0089 ind/km2
9436 onca NaN -52.3 -22.50 Morro do Diabo Brazil 2003-2014 NaN NaN NaN NaN 0.0239 ind/km2
9437 onca NaN -48.2 -24.60 Intervales-PETAR Brazil 2003-2014 NaN NaN NaN NaN 0.0066 ind/km2
9438 onca NaN -54.8 -25.00 Mbaracayú Paraguay 2003-2014 NaN NaN NaN NaN 0.0129 ind/km2
9439 onca NaN -53.7 -26.60 Green Corridor I Brazil 2003-2014 NaN NaN NaN NaN 0.0091 ind/km2
In [37]:
# visualising using plotly
fig = px.scatter_geo(jaguars_0314, lat = 'Latitude', lon = 'Longitude', hover_name = "Locality", color = 'Density',color_continuous_scale = ['green', 'red'])
fig.update_layout(title = 'Jaguar Population Density 2003-14', title_x = 0.5)
fig.show()
0.0080.010.0120.0140.0160.0180.020.0220.024DensityJaguar Population Density 2003-14
plotly-logomark
In [38]:
# visualising using geopandas
import geopandas as gpd
import descartes
from shapely.geometry import Point, Polygon
In [39]:
# had to download a shape file (.shp) of South America
jaguar_map = gpd.read_file('C:/Users/sypak/Downloads/data (1).zip')
In [44]:
# set coordinate reference system
crs = {'init':'epsg:4326'}

# convert longitude and latitude to coordinates
geometry = [Point(xy) for xy in zip(jaguars_0314['Longitude'], jaguars_0314['Latitude'])]
# store a new version of jaguars_0314 as a GeoDataFrame
geo_df = gpd.GeoDataFrame(jaguars_0314, crs = crs, geometry = geometry)
C:\Users\sypak\AppData\Local\Programs\Python\Python310\lib\site-packages\pyproj\crs\crs.py:141: FutureWarning:

'+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6

In [41]:
fig, ax = plt.subplots(figsize = (5, 3))
jaguar_map.boundary.plot(ax = ax, color = 'gray')
geo_df.plot(ax = ax, column = 'Density', alpha = 0.9, legend = True, legend_kwds = {'label': "Population Density"}, cmap = 'OrRd')
ax.set_axis_off()
ax.set_title('Jaguar Population Density 2003-14')
Out[41]:
Text(0.5, 1.0, 'Jaguar Population Density 2003-14')